summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorFernando Sahmkow <fsahmkow27@gmail.com>2023-08-20 17:53:08 +0200
committerFernando Sahmkow <fsahmkow27@gmail.com>2023-09-23 23:05:30 +0200
commitc8237d5c312485394389b2520451ef720604ea9a (patch)
tree1a1064ed38a7a53bd61e4c04bf4571cdebfce2ec
parentQuery Cache: Fix guest side sample counting (diff)
downloadyuzu-c8237d5c312485394389b2520451ef720604ea9a.tar
yuzu-c8237d5c312485394389b2520451ef720604ea9a.tar.gz
yuzu-c8237d5c312485394389b2520451ef720604ea9a.tar.bz2
yuzu-c8237d5c312485394389b2520451ef720604ea9a.tar.lz
yuzu-c8237d5c312485394389b2520451ef720604ea9a.tar.xz
yuzu-c8237d5c312485394389b2520451ef720604ea9a.tar.zst
yuzu-c8237d5c312485394389b2520451ef720604ea9a.zip
-rw-r--r--src/video_core/host_shaders/CMakeLists.txt1
-rw-r--r--src/video_core/host_shaders/queries_prefix_scan_sum.comp124
-rw-r--r--src/video_core/renderer_vulkan/vk_compute_pass.cpp110
-rw-r--r--src/video_core/renderer_vulkan/vk_compute_pass.h14
-rw-r--r--src/video_core/renderer_vulkan/vk_query_cache.cpp147
5 files changed, 348 insertions, 48 deletions
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt
index fb24b6532..8218ec4c8 100644
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -41,6 +41,7 @@ set(SHADER_FILES
pitch_unswizzle.comp
present_bicubic.frag
present_gaussian.frag
+ queries_prefix_scan_sum.comp
resolve_conditional_render.comp
smaa_edge_detection.vert
smaa_edge_detection.frag
diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum.comp b/src/video_core/host_shaders/queries_prefix_scan_sum.comp
new file mode 100644
index 000000000..dce1279fe
--- /dev/null
+++ b/src/video_core/host_shaders/queries_prefix_scan_sum.comp
@@ -0,0 +1,124 @@
+// SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel
+// SPDX-License-Identifier: MIT
+
+// Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and
+// Nicholas Haemel. Modified to suit needs and optimize for subgroup
+
+#version 460 core
+// The macros below let one source build for Vulkan (push constants) and OpenGL (plain uniforms).
+#ifdef VULKAN
+
+#extension GL_KHR_shader_subgroup_arithmetic : enable
+#define HAS_EXTENDED_TYPES 1
+#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
+#define END_PUSH_CONSTANTS \
+ } \
+ ;
+#define UNIFORM(n) // empty: on Vulkan the values are members of the PushConstants block
+#define BINDING_INPUT_BUFFER 0
+#define BINDING_OUTPUT_IMAGE 1
+// HAS_EXTENDED_TYPES and the BINDING_* macros are defined in both branches but not used below.
+#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
+// OpenGL has no push-constant block here: each value becomes a uniform with location n.
+#extension GL_KHR_shader_subgroup_arithmetic : enable
+#extension GL_NV_gpu_shader5 : enable
+#ifdef GL_NV_gpu_shader5
+#define HAS_EXTENDED_TYPES 1
+#else
+#define HAS_EXTENDED_TYPES 0
+#endif
+#define BEGIN_PUSH_CONSTANTS
+#define END_PUSH_CONSTANTS
+#define UNIFORM(n) layout(location = n) uniform
+#define BINDING_INPUT_BUFFER 0
+#define BINDING_OUTPUT_IMAGE 0
+
+#endif
+// Scan parameters; on Vulkan the host fills them through QueriesPrefixScanPushConstants.
+BEGIN_PUSH_CONSTANTS
+UNIFORM(0) uint max_accumulation_base; // elements with index below this add accumulated_data
+UNIFORM(1) uint accumulation_limit; // index of the element main() uses to refresh accumulated_data
+END_PUSH_CONSTANTS
+
+layout(local_size_x = 32) in;
+// Each uvec2 below is one 64-bit value split into x = low word, y = high word (see AddUint64).
+layout(std430, binding = 0) readonly buffer block1 {
+    uvec2 input_data[]; // unsized: main() reads elements [0, 2 * gl_WorkGroupSize.x)
+};
+// Prefix sums of input_data, written by main().
+layout(std430, binding = 1) writeonly coherent buffer block2 {
+    uvec2 output_data[]; // unsized: main() writes elements [0, 2 * gl_WorkGroupSize.x)
+};
+// Running base: read at the start of main() and rewritten by invocation 0 at its end.
+layout(std430, binding = 2) coherent buffer block3 {
+    uvec2 accumulated_data;
+};
+// Scratch copy the scan operates on, two elements per invocation.
+shared uvec2 shared_data[gl_WorkGroupSize.x * 2];
+
+// 64-bit add on uvec2 operands (x = low word, y = high word); the result wraps modulo 2^64.
+uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
+    uint low_carry = 0; // set by uaddCarry to 1 when the low words overflow
+    uint low_sum = uaddCarry(value_1.x, value_2.x, low_carry);
+    uint high_sum = value_1.y + value_2.y + low_carry;
+    return uvec2(low_sum, high_sum);
+}
+
+// Inclusive prefix sum of 64-bit values (uvec2: x = low word, y = high word) in one work group.
+void main(void) {
+    uint id = gl_LocalInvocationID.x;
+    // accumulated_data is only added to elements whose index is below max_accumulation_base
+    uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0);
+    uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0);
+    uint work_size = gl_WorkGroupSize.x;
+    uint rd_id;
+    uint wr_id;
+    uint mask;
+    uvec2 input_1 = input_data[id * 2];
+    uvec2 input_2 = input_data[id * 2 + 1];
+    // The number of steps is the log base 2 of the element count (twice the
+    // work group size); the work group size should be a power of 2
+    const uint steps = uint(log2(work_size)) + 1;
+    uint step = 0;
+
+    // Each invocation is responsible for the content of two elements of the output array
+    shared_data[id * 2] = input_1;
+    shared_data[id * 2 + 1] = input_2;
+    // Synchronize to make sure that everyone has initialized their elements
+    // of shared_data[] with data loaded from the input arrays
+    barrier();
+    memoryBarrierShared();
+    // For each step...
+    for (step = 0; step < steps; step++) {
+        // Calculate the read and write index in the shared array
+        mask = (1 << step) - 1;
+        rd_id = ((id >> step) << (step + 1)) + mask;
+        wr_id = rd_id + 1 + (id & mask);
+        // Accumulate the read data into our element
+        shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]);
+        // Synchronize again to make sure that everyone has caught up with us
+        barrier();
+        memoryBarrierShared();
+    }
+    // Add the accumulation
+    shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1);
+    shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2);
+    barrier();
+    memoryBarrierShared();
+
+    // Finally write our data back to the output buffer
+    output_data[id * 2] = shared_data[id * 2];
+    output_data[id * 2 + 1] = shared_data[id * 2 + 1];
+    if (id == 0) {
+        if (max_accumulation_base >= accumulation_limit + 1) {
+            accumulated_data = shared_data[accumulation_limit];
+            return;
+        }
+        uvec2 value_1 = shared_data[max_accumulation_base];
+        uvec2 value_2 = shared_data[accumulation_limit];
+        // -value_2 would negate each 32-bit word on its own and lose the borrow between them,
+        // so the 64-bit two's complement is formed explicitly as ~value_2 + 1.
+        // NOTE(review): this stores value_1 - value_2; confirm the operand order is intended.
+        accumulated_data = AddUint64(value_1, AddUint64(~value_2, uvec2(1, 0)));
+    }
+} \ No newline at end of file
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
index 039dc95e1..a1af08cda 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
@@ -12,6 +12,7 @@
#include "common/common_types.h"
#include "common/div_ceil.h"
#include "video_core/host_shaders/astc_decoder_comp_spv.h"
+#include "video_core/host_shaders/queries_prefix_scan_sum_comp_spv.h"
#include "video_core/host_shaders/resolve_conditional_render_comp_spv.h"
#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
#include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
@@ -58,6 +59,30 @@ constexpr std::array<VkDescriptorSetLayoutBinding, 2> INPUT_OUTPUT_DESCRIPTOR_SE
},
}};
+constexpr std::array<VkDescriptorSetLayoutBinding, 3> QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS{{
+ { // binding 0: the input_data block of queries_prefix_scan_sum.comp
+ .binding = 0,
+ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .descriptorCount = 1,
+ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+ .pImmutableSamplers = nullptr,
+ },
+ { // binding 1: the output_data block
+ .binding = 1,
+ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .descriptorCount = 1,
+ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+ .pImmutableSamplers = nullptr,
+ },
+ { // binding 2: the accumulated_data block
+ .binding = 2,
+ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .descriptorCount = 1,
+ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+ .pImmutableSamplers = nullptr,
+ },
+}}; // descriptor set layout handed to ComputePass by QueriesPrefixScanPass
+
constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{
.uniform_buffers = 0,
.storage_buffers = 2,
@@ -68,6 +93,16 @@ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{
.score = 2,
};
+constexpr DescriptorBankInfo QUERIES_SCAN_BANK_INFO{ // descriptor counts for the prefix-scan set
+ .uniform_buffers = 0,
+ .storage_buffers = 3, // one per entry of QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS
+ .texture_buffers = 0,
+ .image_buffers = 0,
+ .textures = 0,
+ .images = 0,
+ .score = 3, // NOTE(review): meaning of score is defined by DescriptorBankInfo, not visible here
+};
+
constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCRIPTOR_SET_BINDINGS{{
{
.binding = ASTC_BINDING_INPUT_BUFFER,
@@ -104,6 +139,15 @@ constexpr VkDescriptorUpdateTemplateEntry INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLAT
.stride = sizeof(DescriptorUpdateEntry),
};
+constexpr VkDescriptorUpdateTemplateEntry QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE{
+ .dstBinding = 0, // first binding written
+ .dstArrayElement = 0,
+ .descriptorCount = 3, // bindings hold one descriptor each, so the update continues into 1 and 2
+ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .offset = 0,
+ .stride = sizeof(DescriptorUpdateEntry), // source data is read as consecutive entries
+};
+
constexpr std::array<VkDescriptorUpdateTemplateEntry, ASTC_NUM_BINDINGS>
ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{
{
@@ -132,6 +176,11 @@ struct AstcPushConstants {
u32 block_height;
u32 block_height_mask;
};
+
+struct QueriesPrefixScanPushConstants { // mirrors the push constants of queries_prefix_scan_sum.comp
+ u32 max_accumulation_base; // set from max_accumulation_limit in QueriesPrefixScanPass::Run
+ u32 accumulation_limit; // set to number_of_sums - 1 in QueriesPrefixScanPass::Run
+};
} // Anonymous namespace
ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool,
@@ -313,8 +362,6 @@ ConditionalRenderingResolvePass::ConditionalRenderingResolvePass(
void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_buffer,
u32 src_offset, bool compare_to_zero) {
- scheduler.RequestOutsideRenderPassOperationContext();
-
const size_t compare_size = compare_to_zero ? 8 : 24;
compute_pass_descriptor_queue.Acquire();
@@ -327,7 +374,7 @@ void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_
static constexpr VkMemoryBarrier read_barrier{
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = nullptr,
- .srcAccessMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
+ .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
};
static constexpr VkMemoryBarrier write_barrier{
@@ -349,6 +396,63 @@ void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_
});
}
+QueriesPrefixScanPass::QueriesPrefixScanPass( // passes the scan layout and SPIR-V on to ComputePass
+ const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_,
+ ComputePassDescriptorQueue& compute_pass_descriptor_queue_)
+ : ComputePass(device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS,
+ QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO,
+ COMPUTE_PUSH_CONSTANT_RANGE<sizeof(QueriesPrefixScanPushConstants)>,
+ QUERIES_PREFIX_SCAN_SUM_COMP_SPV),
+ scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} // both members are references
+// Records a 64-bit prefix sum of src_buffer into dst_buffer, seeded from accumulation_buffer.
+// NOTE(review): queries_prefix_scan_sum.comp only scans elements 0..63, whatever number_of_sums is.
+void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer,
+                                VkBuffer src_buffer, size_t number_of_sums,
+                                size_t max_accumulation_limit) {
+    // A work group scans 64 elements (32 invocations, two each); bind whole groups of 64.
+    const size_t aligned_runs = Common::AlignUp(number_of_sums, 64);
+    compute_pass_descriptor_queue.Acquire();
+    compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, aligned_runs * sizeof(u64));
+    compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, aligned_runs * sizeof(u64));
+    compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64));
+    const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()};
+    const QueriesPrefixScanPushConstants uniforms{
+        .max_accumulation_base = static_cast<u32>(max_accumulation_limit),
+        .accumulation_limit = static_cast<u32>(number_of_sums - 1), // index of the last sum
+    };
+    scheduler.RequestOutsideRenderPassOperationContext();
+    scheduler.Record([this, descriptor_data, uniforms](vk::CommandBuffer cmdbuf) {
+        static constexpr VkMemoryBarrier read_barrier{
+            .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
+            .pNext = nullptr,
+            // Shader writes are waited on too: an earlier Run leaves accumulation_buffer written.
+            .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT,
+            .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
+        };
+        static constexpr VkMemoryBarrier write_barrier{
+            .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
+            .pNext = nullptr,
+            .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
+            .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT |
+                             VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT |
+                             VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT |
+                             VK_ACCESS_UNIFORM_READ_BIT |
+                             VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT,
+        };
+        const VkDescriptorSet set = descriptor_allocator.Commit();
+        device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);
+        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
+                               VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier);
+        cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
+        cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});
+        cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms);
+        cmdbuf.Dispatch(1, 1, 1); // shader ignores gl_WorkGroupID; extra groups would only race
+        // Destination is ALL_COMMANDS so that every access bit of write_barrier is supported.
+        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+                               VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, write_barrier);
+    });
+}
+
ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_,
DescriptorPool& descriptor_pool_,
StagingBufferPool& staging_buffer_pool_,
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h
index c62f30d30..e6ff86e9a 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pass.h
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.h
@@ -95,6 +95,20 @@ private:
ComputePassDescriptorQueue& compute_pass_descriptor_queue;
};
+class QueriesPrefixScanPass final : public ComputePass { // compute pass built from queries_prefix_scan_sum.comp
+public:
+ explicit QueriesPrefixScanPass(const Device& device_, Scheduler& scheduler_,
+ DescriptorPool& descriptor_pool_,
+ ComputePassDescriptorQueue& compute_pass_descriptor_queue_);
+ // Records the scan of number_of_sums u64 values from src_buffer into dst_buffer.
+ void Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, VkBuffer src_buffer,
+ size_t number_of_sums, size_t max_accumulation_limit);
+
+private:
+ Scheduler& scheduler; // Run() records its commands through this
+ ComputePassDescriptorQueue& compute_pass_descriptor_queue; // Run() queues its three buffers here
+};
+
class ASTCDecoderPass final : public ComputePass {
public:
explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_,
diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp
index 2147776f8..ded190ae0 100644
--- a/src/video_core/renderer_vulkan/vk_query_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp
@@ -11,6 +11,7 @@
#include <utility>
#include <vector>
+#include "common/bit_util.h"
#include "common/common_types.h"
#include "core/memory.h"
#include "video_core/engines/draw_manager.h"
@@ -112,14 +113,34 @@ class SamplesStreamer : public BaseStreamer {
public:
explicit SamplesStreamer(size_t id_, QueryCacheRuntime& runtime_,
VideoCore::RasterizerInterface* rasterizer_, const Device& device_,
- Scheduler& scheduler_, const MemoryAllocator& memory_allocator_)
+ Scheduler& scheduler_, const MemoryAllocator& memory_allocator_,
+ ComputePassDescriptorQueue& compute_pass_descriptor_queue,
+ DescriptorPool& descriptor_pool)
: BaseStreamer(id_), runtime{runtime_}, rasterizer{rasterizer_}, device{device_},
scheduler{scheduler_}, memory_allocator{memory_allocator_} {
- BuildResolveBuffer();
current_bank = nullptr;
current_query = nullptr;
ammend_value = 0;
acumulation_value = 0;
+ queries_prefix_scan_pass = std::make_unique<QueriesPrefixScanPass>( // the two new parameters are only used here
+ device, scheduler, descriptor_pool, compute_pass_descriptor_queue);
+ // Create the 8-byte device-local buffer later passed to QueriesPrefixScanPass::Run as accumulation_buffer.
+ const VkBufferCreateInfo buffer_ci = {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+ .pNext = nullptr,
+ .flags = 0,
+ .size = 8, // one 64-bit value
+ .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
+ VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+ .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+ .queueFamilyIndexCount = 0,
+ .pQueueFamilyIndices = nullptr,
+ };
+ accumulation_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal);
+ scheduler.RequestOutsideRenderPassOperationContext();
+ scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) {
+ cmdbuf.FillBuffer(buffer, 0, 8, 0); // zero all 8 bytes
+ });
}
~SamplesStreamer() = default;
@@ -159,6 +180,8 @@ public:
acumulation_value = 0;
});
rasterizer->SyncOperation(std::move(func));
+ accumulation_since_last_sync = false;
+ last_accumulation_checkpoint = std::min(last_accumulation_checkpoint, num_slots_used);
}
void CloseCounter() override {
@@ -175,7 +198,8 @@ public:
}
for (size_t i = 0; i < sync_values_stash.size(); i++) {
- runtime.template SyncValues<HostSyncValues>(sync_values_stash[i], *resolve_buffers[i]);
+ runtime.template SyncValues<HostSyncValues>(sync_values_stash[i],
+ *buffers[resolve_buffers[i]]);
}
sync_values_stash.clear();
@@ -189,36 +213,21 @@ public:
sync_values_stash.clear();
sync_values_stash.emplace_back();
std::vector<HostSyncValues>* sync_values = &sync_values_stash.back();
- sync_values->reserve(resolve_slots * SamplesQueryBank::BANK_SIZE);
+ sync_values->reserve(num_slots_used);
std::unordered_map<size_t, std::pair<size_t, size_t>> offsets;
- size_t this_bank_slot = std::numeric_limits<size_t>::max();
- size_t resolve_slots_remaining = resolve_slots;
- size_t resolve_buffer_index = 0;
+ resolve_buffers.clear();
+ size_t resolve_buffer_index = ObtainBuffer<true>(num_slots_used);
+ resolve_buffers.push_back(resolve_buffer_index);
+ size_t base_offset = 0;
+
ApplyBanksWideOp<true>(pending_sync, [&](SamplesQueryBank* bank, size_t start,
size_t amount) {
size_t bank_id = bank->GetIndex();
- if (this_bank_slot != bank_id) {
- this_bank_slot = bank_id;
- if (resolve_slots_remaining == 0) {
- resolve_buffer_index++;
- if (resolve_buffer_index >= resolve_buffers.size()) {
- BuildResolveBuffer();
- }
- resolve_slots_remaining = resolve_slots;
- sync_values_stash.emplace_back();
- sync_values = &sync_values_stash.back();
- sync_values->reserve(resolve_slots * SamplesQueryBank::BANK_SIZE);
- }
- resolve_slots_remaining--;
- }
- auto& resolve_buffer = resolve_buffers[resolve_buffer_index];
- const size_t base_offset = SamplesQueryBank::QUERY_SIZE * SamplesQueryBank::BANK_SIZE *
- (resolve_slots - resolve_slots_remaining - 1);
+ auto& resolve_buffer = buffers[resolve_buffer_index];
VkQueryPool query_pool = bank->GetInnerPool();
scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([start, amount, base_offset, query_pool,
buffer = *resolve_buffer](vk::CommandBuffer cmdbuf) {
- size_t final_offset = base_offset + start * SamplesQueryBank::QUERY_SIZE;
const VkBufferMemoryBarrier copy_query_pool_barrier{
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
.pNext = nullptr,
@@ -227,39 +236,60 @@ public:
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.buffer = buffer,
- .offset = final_offset,
+ .offset = base_offset,
.size = amount * SamplesQueryBank::QUERY_SIZE,
};
cmdbuf.CopyQueryPoolResults(
query_pool, static_cast<u32>(start), static_cast<u32>(amount), buffer,
- static_cast<u32>(final_offset), SamplesQueryBank::QUERY_SIZE,
+ static_cast<u32>(base_offset), SamplesQueryBank::QUERY_SIZE,
VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT);
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_PIPELINE_STAGE_TRANSFER_BIT, 0, copy_query_pool_barrier);
});
- offsets[bank_id] = {sync_values_stash.size() - 1, base_offset};
+ offsets[bank_id] = {start, base_offset};
+ base_offset += amount * SamplesQueryBank::QUERY_SIZE;
});
// Convert queries
+ bool has_multi_queries = false;
for (auto q : pending_sync) {
auto* query = GetQuery(q);
+ size_t sync_value_slot = 0;
if (True(query->flags & VideoCommon::QueryFlagBits::IsRewritten)) {
continue;
}
if (True(query->flags & VideoCommon::QueryFlagBits::IsInvalidated)) {
continue;
}
- if (query->size_slots > 1) {
- // This is problematic.
- // UNIMPLEMENTED();
+ if (accumulation_since_last_sync || query->size_slots > 1) {
+ if (!has_multi_queries) {
+ has_multi_queries = true;
+ sync_values_stash.emplace_back();
+ }
+ sync_value_slot = 1;
}
query->flags |= VideoCommon::QueryFlagBits::IsHostSynced;
auto loc_data = offsets[query->start_bank_id];
- sync_values_stash[loc_data.first].emplace_back(HostSyncValues{
+ sync_values_stash[sync_value_slot].emplace_back(HostSyncValues{
.address = query->guest_address,
.size = SamplesQueryBank::QUERY_SIZE,
- .offset = loc_data.second + query->start_slot * SamplesQueryBank::QUERY_SIZE,
+ .offset =
+ loc_data.second + (query->start_slot - loc_data.first + query->size_slots - 1) *
+ SamplesQueryBank::QUERY_SIZE,
+ });
+ }
+
+ if (has_multi_queries) {
+ size_t intermediary_buffer_index = ObtainBuffer<false>(num_slots_used);
+ resolve_buffers.push_back(intermediary_buffer_index);
+ queries_prefix_scan_pass->Run(*accumulation_buffer, *buffers[intermediary_buffer_index],
+ *buffers[resolve_buffer_index], num_slots_used,
+ std::min(last_accumulation_checkpoint, num_slots_used));
+ } else {
+ scheduler.RequestOutsideRenderPassOperationContext();
+ scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) {
+ cmdbuf.FillBuffer(buffer, 0, 8, 0);
});
}
@@ -267,6 +297,9 @@ public:
std::function<void()> func([this] { ammend_value = acumulation_value; });
rasterizer->SyncOperation(std::move(func));
AbandonCurrentQuery();
+ num_slots_used = 0;
+ last_accumulation_checkpoint = std::numeric_limits<size_t>::max();
+ accumulation_since_last_sync = has_multi_queries;
pending_sync.clear();
}
@@ -400,6 +433,7 @@ private:
void ReserveHostQuery() {
size_t new_slot = ReserveBankSlot();
current_bank->AddReference(1);
+ num_slots_used++;
if (current_query) {
size_t bank_id = current_query->start_bank_id;
size_t banks_set = current_query->size_banks - 1;
@@ -470,32 +504,50 @@ private:
});
}
- void BuildResolveBuffer() {
+ template <bool is_resolve> // true: use resolve_table, false: use intermediary_table
+ size_t ObtainBuffer(size_t num_needed) { // returns an index into buffers, creating the buffer on first use
+ const size_t log_2 = std::max<size_t>(6U, Common::Log2Ceil64(num_needed)); // size class, at least 2^6; NOTE(review): tables have 32 slots, so log_2 < 32 is assumed
+ if constexpr (is_resolve) {
+ if (resolve_table[log_2] != 0) {
+ return resolve_table[log_2] - 1; // tables store index + 1 so that 0 means "not created yet"
+ }
+ } else {
+ if (intermediary_table[log_2] != 0) {
+ return intermediary_table[log_2] - 1;
+ }
+ }
const VkBufferCreateInfo buffer_ci = {
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.pNext = nullptr,
.flags = 0,
- .size = SamplesQueryBank::QUERY_SIZE * SamplesQueryBank::BANK_SIZE * resolve_slots,
+ .size = SamplesQueryBank::QUERY_SIZE * (1ULL << log_2), // room for 2^log_2 query results
.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
.queueFamilyIndexCount = 0,
.pQueueFamilyIndices = nullptr,
};
- resolve_buffers.emplace_back(
- memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal));
+ buffers.emplace_back(memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal));
+ if constexpr (is_resolve) {
+ resolve_table[log_2] = buffers.size(); // size() after emplace_back equals the new index + 1
+ } else {
+ intermediary_table[log_2] = buffers.size();
+ }
+ return buffers.size() - 1;
}
- static constexpr size_t resolve_slots = 8;
-
QueryCacheRuntime& runtime;
VideoCore::RasterizerInterface* rasterizer;
const Device& device;
Scheduler& scheduler;
const MemoryAllocator& memory_allocator;
VideoCommon::BankPool<SamplesQueryBank> bank_pool;
- std::deque<vk::Buffer> resolve_buffers;
+ std::deque<vk::Buffer> buffers;
+ std::array<size_t, 32> resolve_table{};
+ std::array<size_t, 32> intermediary_table{};
+ vk::Buffer accumulation_buffer;
std::deque<std::vector<HostSyncValues>> sync_values_stash;
+ std::vector<size_t> resolve_buffers;
// syncing queue
std::vector<size_t> pending_sync;
@@ -510,10 +562,14 @@ private:
SamplesQueryBank* current_bank;
VkQueryPool current_query_pool;
size_t current_query_id;
+ size_t num_slots_used{};
+ size_t last_accumulation_checkpoint{};
+ bool accumulation_since_last_sync{};
VideoCommon::HostQueryBase* current_query;
bool has_started{};
- bool current_unset{};
std::mutex flush_guard;
+
+ std::unique_ptr<QueriesPrefixScanPass> queries_prefix_scan_pass;
};
// Transform feedback queries
@@ -1090,7 +1146,8 @@ struct QueryCacheRuntimeImpl {
memory_allocator{memory_allocator_}, scheduler{scheduler_}, staging_pool{staging_pool_},
guest_streamer(0, runtime),
sample_streamer(static_cast<size_t>(QueryType::ZPassPixelCount64), runtime, rasterizer,
- device, scheduler, memory_allocator),
+ device, scheduler, memory_allocator, compute_pass_descriptor_queue,
+ descriptor_pool),
tfb_streamer(static_cast<size_t>(QueryType::StreamingByteCount), runtime, device,
scheduler, memory_allocator, staging_pool),
primitives_succeeded_streamer(
@@ -1319,10 +1376,10 @@ bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::Looku
return true;
}
}
- if (!is_in_bc[0] && !is_in_bc[1]) {
+ /*if (!is_in_bc[0] && !is_in_bc[1]) {
// Both queries are in query cache, it's best to just flush.
- return false;
- }
+ return true;
+ }*/
HostConditionalRenderingCompareBCImpl(object_1.address, equal_check);
return true;
}